library(here) # for reproducible paths
library(SingleCellExperiment)
library(scater) # For qc
library(org.Mm.eg.db) # To annotate the genenames
sce <- readRDS(here("processed", "sce.RDS"))
We start by removing genes not expressed in any cell
genes_beforeqc <- dim(sce)[1]
keep_feature <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_feature, ]
genes_beforeqc -dim(sce)[1]
## [1] 6511
6511 cells are not expressed in any cell.
First are sorted the genenames and gene symbols, because the default ensembl notation is not very handy. And then save the mitochondrial genes as such.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
# obtain full genenames
genename <- mapIds(org.Mm.eg.db, keys = rownames(sce), keytype = "ENSEMBL", column = c("GENENAME"))
# Use the symbols as rownames
# first make gene names unique
# TODO: save duplicate gene name list
symb_unique <- uniquifyFeatureNames(rownames(sce), rowData(sce)[, "Symbol"])
# Now they can be used as rownames
rownames(sce) <- symb_unique
# Add full gene names and the uniuqe symbols to the rowdata
rowData(sce)$symb_uniq <- symb_unique
rowData(sce)$gene_name <- genename
# Subset the mitochondrial genes
is_mito <- grepl("^mt-", rownames(sce))
}else{
sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}
Then we can use the scater package to add the quality per cell ## First try log for low thresholds but not for high. In low this prevents negative thresholds
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce <- addPerCellQC(sce, subsets = list(mt=is_mito))
# Automated outlier detection
outlier_lib_low <- isOutlier(sce$total, log = TRUE, type = "lower")
outlier_expr_low <-
isOutlier(sce$detected, log = TRUE, type = "lower")
outlier_lib_high <- isOutlier(sce$total, type = "higher")
outlier_expr_high <-
isOutlier(sce$detected, type = "higher")
outlier_mt <- isOutlier(sce$subsets_mt_percent, type = "higher")
# total
outlier <-
outlier_lib_low |
outlier_expr_low |
outlier_lib_high | outlier_expr_high | outlier_mt
# See how many cells are detected as outliers
DataFrame(
lib_size_high = sum(outlier_lib_high),
expression_high = sum(outlier_expr_high),
lib_size_low = sum(outlier_lib_low),
expression_low = sum(outlier_expr_low),
mt_pct = sum(outlier_mt),
total = sum(outlier)
)
# And the thresholds
attr(outlier_lib_high, "thresholds")
attr(outlier_expr_high, "thresholds")
attr(outlier_lib_low, "thresholds")
attr(outlier_expr_low, "thresholds")
attr(outlier_mt, "thresholds")
# TODO do the analysis separating by batch?
# Add if it is an outlier to the metadata
sce$outlier <- outlier
}else{
sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}
Diagnostic plots to visualize the data distribution. The orange cells are marked as outliers by the automatic detection from scater. The thresholds are not very useful, so these will poblably be ignored.
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="detected", colour_by="outlier") +
scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="chip") +
scale_y_log10() + ggtitle("total count by batch") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="subsets_mt_percent", colour_by="outlier") + ggtitle("Mitocchondrial percentatge") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
hist(
sce$total,
breaks = 100
)
hist(
sce$detected,
breaks = 100
)
hist(
sce$subsets_mt_percent,
breaks = 100
)
### Scatter plots
plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier")
plotColData(sce, x="sum", y="detected", colour_by="outlier")
plotColData(sce, x="sum", y="detected", colour_by="Sample")
Here we run a PCA using the information in the metadata instead of the gene expression. It is useful to visualize the QC parametres.
sce <- runColDataPCA(sce, variables = c("sum","detected", "subsets_mt_percent"))
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample")
sce$chip <- as.character(sce$chip)
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "chip")
This measures will probably very useful to get rid of these cells that have low gene counts but a relatively high umi count. The isoutlier function did not work very well with the sum of umi and the detected genes. This function expects roughly a normal distribution. Bellow we try with the ratio between these two parametres, that do follow a normal distribution.
if(!file.exists(here("processed", "sce_preliminary.RDS"))){
sce$ratio_detected_sum <- sce$detected/sce$sum
sce$outlier_ratio <- isOutlier(sce$ratio_detected_sum, type = "both")
# Save the object at this stage with the label "preliminary analysis"
# As only annotation addition has been done, without deleting anything yet.
saveRDS(sce, here("processed", "sce_preliminary.RDS"))
}else{
sce <- readRDS(here("processed", "sce_preliminary.RDS"))
}
summary(sce$ratio_detected_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005707 0.326323 0.395047 0.404313 0.476884 0.878004
plotColData(sce, x="sum", y="detected", colour_by="ratio_detected_sum")
# Plot an histogram with the ratio between umi and gene counts
hist(
sce$ratio_detected_sum,
breaks = 100
)
# Use the is outlier function from scater to see the cutoffs suggestions
plotColData(sce, x="sum", y="detected", colour_by="outlier_ratio")
attr(sce$outlier_ratio, "thresholds")
## lower higher
## 0.0649170 0.7251773
The suggestions here are very reasonable, it is not enough to delete all the cells we do not trust but we can definitely take this into consideration as a threshold parametre.
plotColData(sce, x="Sample", y="sum", colour_by="outlier_ratio") +
ggtitle("Total count") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="sum", colour_by="outlier") +
scale_y_log10() + ggtitle("Total count log scale") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="Sample", y="detected", colour_by="outlier_ratio") +
scale_y_log10() + ggtitle("Detected Genes") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotColData(sce, x="sum", y="subsets_mt_percent", colour_by="outlier_ratio")
plotColData(sce, x="sum", y="detected", colour_by="outlier_ratio")
Here we run a PCA using the information in the metadata instead of the gene expression. It is useful to visualize the QC parametres.
plotReducedDim(sce, dimred="PCA_coldata", colour_by = "outlier_ratio")